# import libraries here; add more as necessary
# Standard library
from collections import defaultdict
from time import time

# Third-party: numerics, plotting, text
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import plotly
from wordcloud import WordCloud

# scikit-learn
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
# BUG FIX: removed `from sklearn.cross_validation import train_test_split` —
# the sklearn.cross_validation module was dropped in scikit-learn 0.20, and
# the sklearn.model_selection import above already provides train_test_split.
# magic word for producing visualizations in notebook
%matplotlib inline
What is the data like?
How many numerical and string dtypes do we have?
What is the correlation between Installs, Reviews, Price, and free vs. paid apps?
What are the categorical data?
# Load both Kaggle CSVs: the app listing and the user-review dump.
df_playstore = pd.read_csv('googleplaystore.csv', delimiter=',')
df_reviews = pd.read_csv('googleplaystore_user_reviews.csv', delimiter=',')

# Quick structural overview of the listing data.
df_playstore.head()
print(df_playstore.shape)
print(df_playstore.info())
print(df_playstore.columns)
Find the unique values and their counts per column to plan the data-preparation tasks ahead. Identify which kinds of values appear in each column and each column's dtype.
# Per-column frequency tables help spot the cleanup work needed ahead.
for column_name in df_playstore.columns:
    print(df_playstore[column_name].value_counts())
df_reviews.head()
Define -
Data cleaning steps- df_playstore
1. Drop duplicates in 'App' column googleplaystore
2. Remove "M" & "k" from Size column. Convert value to MB and dtype to float."Varies with device" would be NaN. Replace 1,000+ with 1.
3. Remove "+" from Installs column. Convert to int.
4. Remove single row in playstore dataframe where Category = 1.9.
5. Replace zero 0 with Free in column Type. Convert to float
6. Remove $ sign from Price column and convert to float
7. Genres - split on ; keep second but remove the first part
8. Last Updated - change dtype to datetime and time
9. Current Ver - do nothing as of now.
Android Ver - split and remove " and up" string... If time allows.!
df_playstore.columns
# Keep only the first occurrence of each app name.
df_playstore.drop_duplicates(['App'], keep='first', inplace=True)
# One malformed row has Category == '1.9' (a column shift in the raw CSV);
# locate its index and drop that row.
bad_rows = df_playstore.loc[df_playstore['Category'] == '1.9'].index
df_playstore.drop(list(bad_rows), axis=0, inplace=True)
# --- Size column ---
# "Varies with device" carries no numeric information -> NaN.
df_playstore['Size'].replace('Varies with device', np.nan, inplace=True)
# Strip the trailing unit/symbol: "19M" -> "19", "1,000+" -> "1,000".
df_playstore['Size'] = df_playstore['Size'].map(
    lambda v: str(v)[:-1] if 'M' in str(v) else v)
df_playstore['Size'] = df_playstore['Size'].map(
    lambda v: str(v)[:-1] if '+' in str(v) else v)
# Kilobyte entries become megabytes: "201k" -> 0.201.
df_playstore['Size'] = df_playstore['Size'].apply(
    lambda v: float(str(v).replace('k', '')) / 1000 if 'k' in str(v) else v)
# The leftover "1,000" string (from "1,000+") is treated as 1 MB.
df_playstore['Size'].replace('1,000', '1', inplace=True)
df_playstore['Size'] = df_playstore['Size'].astype('float')
df_playstore['Size'].dtype
# --- Installs column ---
# "10,000+" -> "10,000" -> "10000"; the odd "Free" entry becomes 0.
df_playstore['Installs'] = df_playstore['Installs'].map(
    lambda v: str(v)[:-1] if '+' in str(v) else v)
df_playstore['Installs'] = df_playstore['Installs'].map(
    lambda v: str(v).replace(',', '') if ',' in str(v) else v)
df_playstore['Installs'].replace('Free', '0', inplace=True)
df_playstore['Installs'] = df_playstore['Installs'].astype('int')
df_playstore['Installs'].dtype
# --- Rating / Reviews / Price columns ---
# Ratings become floats, review counts become ints.
df_playstore['Rating'] = df_playstore['Rating'].astype('float')
df_playstore['Reviews'] = df_playstore['Reviews'].astype('int')
df_playstore['Reviews'].dtype
# Drop the "$" prefix, rename the column to make the unit explicit, then cast.
df_playstore['Price'] = df_playstore['Price'].map(
    lambda v: str(v).replace('$', '') if '$' in str(v) else v)
df_playstore.rename(columns={'Price': 'Price_in_dollars'}, inplace=True)
df_playstore['Price_in_dollars'] = df_playstore['Price_in_dollars'].astype('float')
print(df_playstore['Price_in_dollars'].dtype)
# --- Genres / Last Updated columns ---
print(df_playstore['Genres'].shape)
# For multi-genre entries ("A;B") keep only the part before the ';'.
# NOTE(review): the cleaning plan in the notebook text says "keep second ...
# remove the first part", but rsplit(';')[0] keeps the FIRST part — confirm
# which behaviour was intended.
df_playstore['Genres'] = df_playstore['Genres'].map(
    lambda v: str(v).rsplit(';')[0] if ';' in str(v) else v)
print(df_playstore['Genres'].shape[0])
print(df_playstore['Genres'].dtype)
# Parse the "Last Updated" strings into proper datetimes.
df_playstore['Last Updated'] = pd.to_datetime(df_playstore['Last Updated'], dayfirst=True)
1. Find and replace missing values columnwise
2. Find and replace missing values rowwise.
# Percentage of missing values per column, shown as a bar chart
# (only columns that actually contain NaNs are kept).
missing_pct = df_playstore.isnull().sum()
missing_pct = (missing_pct[missing_pct > 0] / df_playstore.shape[0]) * 100
missing_pct.sort_values(inplace=True)
missing_pct.plot.bar(title='Column wise percentage missing', figsize=(6, 3))
# Impute NaNs in the numeric Rating and Size columns with the column mean —
# mean imputation introduces minimal bias for roughly symmetric distributions.
# FIX: assign the result back instead of calling fillna(..., inplace=True) on
# a column selection — chained inplace mutation is deprecated in modern pandas
# and can silently operate on a copy.
df_playstore['Rating'] = df_playstore['Rating'].fillna(df_playstore['Rating'].mean())
df_playstore['Size'] = df_playstore['Size'].fillna(df_playstore['Size'].mean())
# Re-plot the per-column missing percentages to confirm the imputation worked.
missing_data = df_playstore.isnull().sum()
missing_data = (missing_data[missing_data > 0] / df_playstore.shape[0]) * 100
missing_data.sort_values(inplace=True)
missing_data.plot.bar(title='Column wise percentage missing', figsize=(6, 3))
In the above chart there are three columns - Type, Android Version and Current Version - which still have NaN or missing values, but they will not be replaced with any values because I will not use these columns for modelling or analysis purposes. I shall drop these columns from my analysis dataset.
Type column would be one hot encoded later in the stage.
df_playstore.head()
# Row-wise missing-value counts: isnull().sum(axis=1) counts NaNs per row.
# Nothing is dropped here — this only visualizes how incomplete the rows are.
row_missing = df_playstore.isnull().sum(axis=1)
row_missing = (row_missing[row_missing > 0] / df_playstore.shape[0]) * 100
row_missing.sort_values(inplace=True)
row_missing.plot.bar(title='Column wise percentage missing', figsize=(6, 3))
print(df_playstore.iloc[7333])
In above step, I have not deleted rows with missing values as the rows still have important information for many columns. All numeric columns do have missing values. I would handle NAN values during scaling and modelling if I face any errors.
Once the data is cleaned and usable, plot scatter and bar chart to visualize data. Data understanding is combined with data cleaning preparing activities before putting it for scaling and modelling.
#Scatterplot Matrix from seaborn
# NOTE(review): each series below is filtered independently (dropna, != 0), so
# zip() pairs values positionally from differently-filtered columns and
# truncates to the shortest series — rows are NOT aligned by app. Confirm this
# is acceptable for an exploratory pairplot.
x = df_playstore['Rating'].dropna()
y = df_playstore['Size'].dropna()
z = df_playstore['Installs'][df_playstore.Installs!=0].dropna()
p = df_playstore['Reviews'][df_playstore.Reviews!=0].dropna()
t = df_playstore['Type'].dropna()
price = df_playstore['Price_in_dollars']
# log / log10 compress the heavy-tailed Installs and Reviews counts;
# hue colours points by app Type (Free vs Paid).
p = sns.pairplot(pd.DataFrame(list(zip(x, y, np.log(z), np.log10(p), t, price)),
columns=['Rating','Size', 'Installs', 'Reviews', 'Type', 'Price_in_dollars']), hue='Type', palette="Set2")
Plot categorical column 'Category' to visualize number of Apps available per category in the dataset. This chart would find its place in the blog.
# How many categories exist, and how many apps fall into each one.
print(len(df_playstore['Category'].value_counts()))
df_playstore['Category'].value_counts().plot.bar(
    title='Number of Categorical Apps', figsize=(30, 20), fontsize=18)
Data Preprocessing- One hot encode and Label encode the categorical columns. Then scale down all numerical columns using standard scaler function.
Using get_dummies on the 'Type' column adds two indicator columns of 0s and 1s, which should not introduce much bias. After scaling, the values would still easily be between 0 and 1. This is a simple categorical column without many labels, so get_dummies is best suited here.
Using Label encoding on Category column would mark all 33 different columns with unique ids. This would add some bias though but scaling should normalize the values. It would be interesting to see how Linear Regression model weighs on Category column.
Columns like Genres and versions are not useful for modelling hence I would drop them before scaling later.
Reference links- https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
# Distinct category labels in the cleaned listing.
Categories = df_playstore['Category'].unique()
Categories
df_playstore.head()
# One-hot encode the binary Free/Paid column via get_dummies
# (produces Type_Free / Type_Paid indicator columns).
df_playstore_temp = pd.get_dummies(df_playstore, prefix='Type', columns=['Type'])
values = np.array(df_playstore_temp['Category'].unique())
values
# #label encoding Categorical values
# label_encoder = LabelEncoder()
# integer_encoded = label_encoder.fit_transform(values)
# print (integer_encoded)
# #One hot encoding categorical values
# onehot_encoder = OneHotEncoder(sparse=False)
# integer_encoded = integer_encoded.reshape(len(integer_encoded),1)
# onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
# print (onehot_encoded)
# ##Inverse transform one hot encoded
# # inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0,:])])
# # print(inverted)
# Label-encode the two remaining string columns into integer ids.
# FIX: use one encoder per column — refitting a single shared encoder leaves
# it fitted on the last column only, breaking any later inverse_transform.
category_encoder = LabelEncoder()
df_playstore_temp['Category'] = category_encoder.fit_transform(df_playstore_temp['Category'])
content_rating_encoder = LabelEncoder()
df_playstore_temp['Content Rating'] = content_rating_encoder.fit_transform(df_playstore_temp['Content Rating'])
# Drop identifier / free-text / version columns not used for modelling.
# FIX: the original passed both columns=[...] and axis=0, which is
# contradictory — columns= alone already means "drop these columns".
df_playstore_temp.drop(columns=['App', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'],
                       inplace=True)
df_playstore_temp.head()
Final check and updates on missing values before applying scaling. If there are NaN values then scaling will fail. Ideally there shouldn't be any NaN or missing values left after the data cleaning steps above; if any remain, removing those rows could help.
# Replace any remaining NaNs with the column means so that the scaling step
# below (StandardScaler) does not fail on missing values.
df_playstore_temp.fillna(df_playstore_temp.mean(), inplace=True)
# Coerce every column to a numeric dtype before scaling.
# BUG FIX: DataFrame.apply returns a NEW frame — the original discarded the
# result, making this conversion a silent no-op; assign it back.
df_playstore_temp = df_playstore_temp.apply(pd.to_numeric)
# Sanity check: indices of any NaNs still present (should be empty arrays).
np.where(np.isnan(df_playstore_temp))
# Apply feature scaling on all values to the entire numerical dataframe.
# Standardize every numeric feature (zero mean, unit variance) and keep the
# result in a DataFrame carrying the original index and column labels.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_playstore_temp.values)
df_scaled_features = pd.DataFrame(scaled_features,
                                  index=df_playstore_temp.index,
                                  columns=df_playstore_temp.columns)
df_scaled_features.describe()
# Visualize how much variance each principal component explains.
def scree_plot(pca):
    """Draw a scree plot for a fitted PCA model.

    Bars show the variance ratio explained by each principal component;
    the overlaid line shows the cumulative ratio.

    INPUT: pca - a fitted scikit-learn PCA instance
    OUTPUT: None (renders a matplotlib figure)
    """
    ratios = pca.explained_variance_ratio_
    component_ids = np.arange(len(ratios))
    plt.figure(figsize=(25, 10))
    ax = plt.subplot(111)
    cumulative = np.cumsum(ratios)
    ax.bar(component_ids, ratios)
    ax.plot(component_ids, cumulative)
    print(component_ids, cumulative)
    # Annotate each bar with its percentage (first four characters of the value).
    for idx, ratio in enumerate(ratios):
        ax.annotate(r"%s%%" % ((str(ratio * 100)[:4])),
                    (component_ids[idx] + 0.2, ratio),
                    va="bottom", ha="center", fontsize=12)
    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)
    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
# Fit a 9-component PCA on the standardized features and inspect the scree.
pca = PCA(n_components=9)
pca_scaled_features = pca.fit_transform(scaled_features)
scree_plot(pca)
# Link each feature name with its weight in a chosen principal component and
# return the pairs sorted by weight (most negative first).
def sorted_weights(pca, ix, dataset):
    """Return (weight, feature_name) pairs for one principal component.

    Input parameters-
    pca     - a fitted PCA model (reads pca.components_)
    ix      - index of the component row in pca.components_
    dataset - DataFrame whose column labels name the features

    Output: list of (weight, name) tuples sorted ascending by weight.
    """
    weights = pca.components_[ix]
    names = dataset.keys().values
    pairs = list(zip(weights, names))
    pairs.sort(key=lambda pair: pair[0])
    return pairs
# Inspect the sorted loadings for principal components 1, 2 and 3.
sorted_weights(pca, 1, df_scaled_features)
sorted_weights(pca, 2, df_scaled_features)
sorted_weights(pca, 3, df_scaled_features)
NOTE -
PCA analysis shows that 8 components have variance between 23.4% to 4.15% which would affect the prediction of outcomes.
As a rule of thumb, a component weight above 0.5 (positive or negative) is deemed to affect the prediction. Hence, in the first set of feature loadings, review and installation counts play a big role in predicting which category of apps would receive reviews and higher numbers of installations.
In the second set of components, Category, Installs, Size and Content Rating variance is inverserly proportional. It implies that Size of App and number of installs are dependent and customers prefer to install low size apps.
The third set of components shows that App Price and Ratings have a strong inverse relationship: customers who provide higher ratings tend to install apps that are cheaper.
KMean clustering of dataset shows that there are three clusters of data in majority. I have used KMeans clustering to separate clusters of data within the dataset based on PCA analysis but it seems that the data is not clearly separable and hands around 3 major clusters.
def plot_data(data, labels):
    '''
    3-D scatter of the first three columns of `data`, coloured by cluster label.
    '''
    figure = plt.figure()
    axes3d = Axes3D(figure)
    axes3d.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap='tab10')
# KMeans on the PCA-projected features: first with 15 clusters...
kmeans_pop = KMeans(n_clusters=15)
model_pop = kmeans_pop.fit(pca_scaled_features)
labels_pop = model_pop.predict(pca_scaled_features)
plot_data(pca_scaled_features, labels_pop)
# ...then with 7 clusters for comparison. (The original comments said
# "8 clusters", which matched neither run.)
kmeans_pop = KMeans(n_clusters=7)
model_pop = kmeans_pop.fit(pca_scaled_features)
labels_pop = model_pop.predict(pca_scaled_features)
plot_data(pca_scaled_features, labels_pop)
df_playstore_temp.head()
Use Linear Regression model to predict the following business questions-
1. Predict Ratings of apps for all categories.
2. Predict Pricing of apps.
3. Predict pricing in relation to number of installations
Case One- Target Label = 'Rating'
# ---- Case One: predict app Rating with linear regression ----
# Features = every encoded numeric column except the target.
Target_label = df_playstore_temp['Rating']
features_label = df_playstore_temp.drop('Rating', axis=1)
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features_label, Target_label, test_size=0.2, random_state=0)
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
# NOTE(review): normalize=True was deprecated in scikit-learn 1.0 and removed
# in 1.2 — on modern sklearn, use a Pipeline with StandardScaler instead.
lm_model = LinearRegression(normalize=True)
# BUG FIX: the original bare `except:` swallowed every error (including
# KeyboardInterrupt) and hid the reason; catch Exception and report it.
try:
    lm_model.fit(X_train, y_train)
except Exception as exc:
    print("Oh no! It doesn't work!!!", exc)
# Evaluate on the held-out test split.
y_test_preds = lm_model.predict(X_test)
r2_test = r2_score(y_test, y_test_preds)
print('R Squared value of the predicted labels: ' + str(r2_test))
coefficients = list(zip(lm_model.coef_, X_test))
print('Mean Squared Error: ' + str(metrics.mean_squared_error(y_test, y_test_preds)))
print('Mean absolute Error: ' + str(metrics.mean_absolute_error(y_test, y_test_preds)))
print('Mean squared Log Error: ' + str(metrics.mean_squared_log_error(y_test, y_test_preds)))
# Linear regression coefficients for Ratings as target labels.
coefficients
# Predicted vs actual ratings with a fitted trend line.
plt.figure(figsize=(12, 7))
# FIX: pass x/y by keyword — positional data args were removed from seaborn's
# regplot in 0.12. The no-op plt.legend() (no labelled artists) is dropped.
sns.regplot(x=y_test_preds, y=y_test, color='teal', marker='x')
plt.title('Linear Regression model- App Ratings')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()
Case Two- Target Label = 'Price_in_dollars'
# ---- Case Two: predict Price_in_dollars from the scaled feature set ----
Target_label1 = df_scaled_features['Price_in_dollars']
features_label1 = df_scaled_features.drop('Price_in_dollars', axis=1)
# 80/20 train/test split with a fixed seed for reproducibility.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    features_label1, Target_label1, test_size=0.2, random_state=0)
print("Training set has {} samples.".format(X_train1.shape[0]))
print("Testing set has {} samples.".format(X_test1.shape[0]))
# NOTE(review): normalize=True is removed in scikit-learn >= 1.2 (see Case One).
lm_model1 = LinearRegression(normalize=True)
# BUG FIX: narrow the bare except and surface the real error.
try:
    lm_model1.fit(X_train1, y_train1)
except Exception as exc:
    print("Oh no! It doesn't work!!!", exc)
# Evaluate on the held-out test split.
y_test_preds1 = lm_model1.predict(X_test1)
r2_test1 = r2_score(y_test1, y_test_preds1)
coefficients1 = list(zip(lm_model1.coef_, X_test1))
print('R Squared value of the predicted labels: ' + str(r2_test1))
print('Mean Squared Error: ' + str(metrics.mean_squared_error(y_test1, y_test_preds1)))
print('Mean absolute Error: ' + str(metrics.mean_absolute_error(y_test1, y_test_preds1)))
# Mean squared log error is intentionally omitted: scaled targets contain
# negative values, for which MSLE is undefined.
# Linear regression coefficients for Price as target labels.
coefficients1
# Actual vs predicted price with a fitted trend line.
plt.figure(figsize=(12, 7))
# FIX: keyword x/y — positional data args were removed from seaborn regplot 0.12.
sns.regplot(x=y_test1, y=y_test_preds1, color='teal', marker='x')
plt.title('Linear Regression model- predicted price vs actual price')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.show()
Case Three- Target Label = 'Installs'
# ---- Case Three: predict Installs from the scaled feature set ----
Target_label2 = df_scaled_features['Installs']
features_label2 = df_scaled_features.drop('Installs', axis=1)
# 80/20 train/test split with a fixed seed for reproducibility.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    features_label2, Target_label2, test_size=0.2, random_state=0)
print("Training set has {} samples.".format(X_train2.shape[0]))
print("Testing set has {} samples.".format(X_test2.shape[0]))
# NOTE(review): normalize=True is removed in scikit-learn >= 1.2 (see Case One).
lm_model2 = LinearRegression(normalize=True)
# BUG FIX: narrow the bare except and surface the real error.
try:
    lm_model2.fit(X_train2, y_train2)
except Exception as exc:
    print("Oh no! It doesn't work!!!", exc)
# Evaluate on the held-out test split.
y_test_preds2 = lm_model2.predict(X_test2)
r2_test2 = r2_score(y_test2, y_test_preds2)
coefficients2 = list(zip(lm_model2.coef_, X_test2))
print('R Squared value of the predicted labels: ' + str(r2_test2))
print('Mean Squared Error: ' + str(metrics.mean_squared_error(y_test2, y_test_preds2)))
print('Mean absolute Error: ' + str(metrics.mean_absolute_error(y_test2, y_test_preds2)))
# MSLE omitted: scaled targets contain negative values (undefined for MSLE).
coefficients2
# Overlay Case Two's price predictions with Case Three's installs predictions.
plt.figure(figsize=(12, 7))
# FIX: keyword x/y — positional data args were removed from seaborn regplot 0.12.
sns.regplot(x=y_test_preds1, y=y_test1, color='teal', marker='x')
sns.regplot(x=y_test_preds2, y=y_test2, color='orange')
plt.title('Linear Regression model- Predicted Price vs Installs')
plt.xlabel('Predicted Installs')
plt.ylabel('Predicted Price')
plt.show()
Evaluate the mean squared error MSE-
Case One- Target Label = 'Rating' (1.3819674820377472e-08, 'Reviews'), (1.0710294006119369e-10,'Installs')
with 'Rating' as target label, model has predicted with much higher coefficients of Reviews and Installs clearly because of high biases which does not show a correct picture.Category values should have been weighed higher - (-0.0018024997071665536, 'Category'). To me this looks like a misprediction due to biased values.
Case Two- Target Label = 'Price_in_dollars' - Mean Squared Error: 1.9064629909990272. The MSE is not close to 0 and is the highest in comparison to the other target labels, but the coefficients are evenly distributed and it seems no bias was introduced during prediction. With the maximum coefficient value (0.19030847314185675, 'Type_Paid'), Linear Regression identifies Price as an important feature for App selection, which sounds rational too.
Case Three- Target Label = 'Installs' - Mean Squared Error: 0.4897947759940874, MSE is close to Zero which is a good sign and coefficients have correctly identified the weights of features as (0.6627852226270038, 'Reviews')(0.025157112715526838, 'Category'). This shows that Linear Regression model could predict success of an App better if the target label is set to Installs.
It is even interesting to see corelation between predicted price and installs.
Predicted Price vs Predicted Installs
The Linear Regression model shows that predicted price and number of installs are proportional and tend to increase together. (The small blue line is predicted price and the large orange line is predicted installs.) From the violin plot at the end it will be clear that, for certain categories of apps on the Play Store, consumers do pay for the app.
"An MSE of zero, meaning that the estimator predicts observations of the parameter with perfect accuracy, is the ideal, but is typically not possible."
Reference link- https://en.wikipedia.org/wiki/Mean_squared_error
Apps in categories like Events, Lifestyle, Games, and Books range from free to the highest prices, whereas Books and Reference, Dating, Finance, Health and Fitness, Gaming, Family, and Medical apps have lower prices.
Apps in categories like Game, Lifestyle, Family, Medical, Travel and local, productivity, Health and fitness get paid for with higher number of installations.
# Strip plots of price and install counts across app categories.
def _category_stripplot(value_column, chart_title):
    """Jittered strip plot of one numeric column, one row of dots per category."""
    # NOTE: the isin(unique()) filter keeps every row; retained from the
    # original notebook for parity.
    frame = df_playstore[df_playstore.Category.isin(df_playstore['Category'].unique())]
    sns.set_style('darkgrid')
    fig, ax = plt.subplots()
    fig.set_size_inches(15, 8)
    sns.stripplot(x=value_column, y="Category", data=frame, jitter=True, linewidth=1)
    ax.set_title(chart_title)

_category_stripplot("Price_in_dollars", 'App pricing trend across all categories')
_category_stripplot("Installs", 'App Installation trend across all categories')
Reviews and Installs have a strong correlation, which is rational given the dataset.
# Pairwise correlations between the encoded numeric columns, annotated to 2 dp.
sns.heatmap(df_playstore_temp.corr(), annot=True, fmt='.2f')
Histogram for scaled and numerical version Playstore dataset.
df_playstore_temp.hist(figsize=(15,10))
Apps which are paid for have got better rating and more number of Installs.
#function with docstring to plot comparison charts between various features of dataset
def func_plotjoint(a,b,color='g'):
    """Function to plot seaborn jointplot chart
    Input parameters -
    a = series plotted on the x-axis (dataframe column)
    b = series plotted on the y-axis (dataframe column)
    color = 'r' for red, 'c' for cyan, default is green
    """
    sns.jointplot(x=a,y=b,color=color)
#Plot charts between different important features from modelling and PCA analysis.
func_plotjoint(df_playstore['Installs'],df_playstore['Rating'])
func_plotjoint(df_playstore['Price_in_dollars'],df_playstore['Rating'],color='r')
func_plotjoint(df_playstore['Installs'],df_playstore['Price_in_dollars'],color='c')
# Keep only categories with at least 50 apps; report the overall mean rating.
groups = df_playstore.groupby('Category').filter(lambda x: len(x) >= 50).reset_index()
print('Average rating = ', np.nanmean(list(groups.Rating)))
# Evenly spaced HSL colour strings, one per category.
# NOTE(review): `c` is never passed to the plot below — the colours are unused.
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 720, len(set(groups.Category)))]
# Plotly layout: the dash-dot horizontal line marks the overall mean rating.
layout = {'title' : 'App rating distribution and probability density for all categories',
'xaxis': {'tickangle':-40},
'yaxis': {'title': 'Rating'},
'plot_bgcolor': 'rgb(250,250,250)',
'shapes': [{
'type' :'line',
'x0': -.5,
'y0': np.nanmean(list(groups.Rating)),
'x1': 19,
'y1': np.nanmean(list(groups.Rating)),
'line': { 'dash': 'dashdot'}
}]
}
# One violin trace per category.
# NOTE(review): the y-values come from the unfiltered df_playstore while the
# category list comes from the filtered `groups` — confirm this is intended.
data = [{
'y': df_playstore.loc[df_playstore.Category==category]['Rating'],
'type':'violin',
'name' : category,
'showlegend':False,
#'marker': {'color': 'Set2'},
} for i,category in enumerate(list(set(groups.Category)))]
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
plotly.offline.iplot({'data': data, 'layout': layout})
Average rating for all App Categories
This chart shows median value of Ratings per Category together with Max Rating received and minimum Rating. This chart also shows combination of ratings distribution and probability density per category.
Tools, Productivity, Finance, Category- has range of ratings from 1 to 5
Assuming that Rating is equivalent to popularity, and width and length of curved area of the graph implies that Comics, Health & Fitness,Parenting, Art and Design,
Reference- https://en.wikipedia.org/wiki/Violin_plot https://en.wikipedia.org/wiki/Kernel_density_estimation http://seaborn.pydata.org/generated/seaborn.violinplot.html
# Keep only categories with at least 300 apps; report the overall mean price.
groups = df_playstore.groupby('Category').filter(lambda x: len(x) >= 300).reset_index()
# NOTE(review): the label says "Average rating" but the value printed is the
# mean PRICE — the label text looks like a copy-paste slip.
print('Average rating = ', np.nanmean(list(groups.Price_in_dollars)))
# Evenly spaced HSL colour strings, one per category (unused by the plot below).
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 720, len(set(groups.Category)))]
# Plotly layout: the dash-dot horizontal line marks the overall mean price.
layout = {'title' : 'App Price distribution and probability density for top 10 categories',
'xaxis': {'tickangle':-40},
'yaxis': {'title': 'Price_in_dollars'},
'plot_bgcolor': 'rgb(250,250,250)',
'shapes': [{
'type' :'line',
'x0': -.5,
'y0': np.nanmean(list(groups.Price_in_dollars)),
'x1': 19,
'y1': np.nanmean(list(groups.Price_in_dollars)),
'line': { 'dash': 'dashdot'}
}]
}
# One violin trace per category; y-values come from the unfiltered df_playstore.
data = [{
'y': df_playstore.loc[df_playstore.Category==category]['Price_in_dollars'],
'type':'violin',
'name' : category,
'showlegend':False,
#'marker': {'color': 'Set2'},
} for i,category in enumerate(list(set(groups.Category)))]
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
plotly.offline.iplot({'data': data, 'layout': layout})
Define-
1. What do users think about the Apps? Keywords?
2. Sentiment Analysis.
df_reviews.head()
A wordcloud of most frequently used words used in User reviews by the users.
# Replace missing review texts with empty strings so every row is a string.
df_reviews['Translated_Review'].replace(np.NaN, '', inplace=True)
# BUG FIX: the original passed `.dropna` (the method OBJECT, not a call), so
# the word cloud was built from the repr of a bound method instead of the
# review text. Join the actual review strings into one corpus — stringifying
# a large ndarray would also truncate its repr with an ellipsis.
corpus = ' '.join(df_reviews['Translated_Review'].dropna())
# FIX: create the sized figure BEFORE imshow — the original called
# plt.figure() after imshow, opening a second, empty figure.
plt.figure(figsize=(40, 30))
# Build and display a 100-word word cloud of the review vocabulary.
wordcloud = WordCloud(max_words=100, background_color='white').generate(corpus)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
A wordcloud of most frequently used words used for App names.
# Word cloud of the distinct app names appearing in the reviews dataset.
df_reviews['App'].replace(np.NaN, '', inplace=True)
# BUG FIX: the original stringified an ndarray of whole de-duplicated ROWS
# (all columns), so the cloud contained review text and array-repr artefacts.
# Use the unique app names only, joined into a single corpus.
app_names = ' '.join(df_reviews['App'].drop_duplicates().dropna())
# FIX: size the figure before imshow (the original opened an empty figure after).
plt.figure(figsize=(40, 30))
wordcloud = WordCloud(max_words=200, background_color='white').generate(app_names)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()